# Initialize a browser instance and scrape CNKI search results
import time

import pymongo
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Page-parsing module
from bs4 import BeautifulSoup

# NOTE(review): launching the driver and opening the DB connection at import
# time is a module-level side effect — confirm this script is never imported.
browser = webdriver.Edge()

# Explicit wait (10-second timeout) shared by all scraping helpers below.
wait = WebDriverWait(browser, 10)
# MongoDB connection
client = pymongo.MongoClient(host='192.168.3.110', port=27017)
mongo = client.cnki # database
collection = mongo.papers # collection

# Open the CNKI home page and search for a keyword.
def searcher(keyword):
    """Search CNKI for *keyword*, switch the result list to 50 rows per
    page, wait for the list to render, then parse the first result page.

    Relies on the module-level ``browser`` and ``wait`` objects; calls
    ``parse_page`` for the first page of results.
    """
    browser.get('https://www.cnki.net/')
    time.sleep(2)
    # 'search_box' instead of 'input' — don't shadow the builtin.
    search_box = wait.until(
        EC.presence_of_element_located((By.ID, 'txt_SearchText'))
    )
    search_box.send_keys(keyword)
    # element_to_be_clickable: presence alone does not guarantee the
    # button can be clicked yet (overlay/render race).
    wait.until(EC.element_to_be_clickable(
        (By.CLASS_NAME, 'search-btn')
    )).click()
    time.sleep(3)
    # Open the page-size dropdown and pick the third option (50 per page).
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '[class="icon icon-sort"]')
    )).click()
    wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, '#id_grid_display_num ul li')
    ))[2].click()
    time.sleep(3)
    # Make sure the result rows are fully loaded before parsing.
    wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, '.result-table-list tbody tr')
    ))
    parse_page()


# Advance to the next result page, reporting whether one existed.
def next_page():
    """Click the "next page" control.

    Returns:
        True when the control became visible and was clicked,
        False when it never appeared within the wait timeout
        (i.e. we are already on the last page).
    """
    time.sleep(2)
    try:
        next_btn = wait.until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, '#Page_next_top'))
        )
    except TimeoutException:
        # No "next" control → last page reached.
        return False
    next_btn.click()
    return True


# Persist one parsed paper record into MongoDB.
def data_storage(paper):
    """Insert *paper* (a dict built from one result row) into the
    module-level ``collection``.

    Best-effort: database errors are printed and swallowed so one bad
    record does not abort the whole crawl. Note that ``insert_one``
    mutates *paper* by adding an ``_id`` key.
    """
    try:
        collection.insert_one(paper)
    # Narrowed from bare ``Exception``: only swallow database errors,
    # not programming bugs (TypeError, KeyError, ...).
    except pymongo.errors.PyMongoError as e:
        print('存数据出错：', e)


def parse_page():
    """Parse the currently loaded result page and store each row.

    Reads the live page source from the module-level ``browser``; each
    result-table row yields a dict with keys index/title/author/
    resource/time/database, which is printed and passed to
    ``data_storage``.
    """
    soup = BeautifulSoup(browser.page_source, 'lxml')
    rows = soup.select('.result-table-list tbody tr')
    fields = ('index', 'title', 'author', 'resource', 'time', 'database')
    for row in rows:
        cells = row.select('td')
        # Guard: skip malformed rows (ads / section headers) that have
        # fewer than 6 cells — the original indexing raised IndexError.
        if len(cells) < len(fields):
            continue
        # zip truncates to the first 6 cells, matching the original.
        paper = {name: cell.text.strip() for name, cell in zip(fields, cells)}
        print(paper)
        # Save the record to the database.
        data_storage(paper)


if __name__ == '__main__':
    # Entry point: search one keyword, then walk every result page.
    keyword = 'Python'
    try:
        # Run the search; this also parses the first result page.
        searcher(keyword)
        time.sleep(5)
        # Keep clicking "next" until there is no next page, parsing
        # each newly loaded page of results.
        while next_page():
            parse_page()
    finally:
        # quit() (not close()) shuts down the whole WebDriver process,
        # not just the current window; cleanup runs even on error.
        browser.quit()
        client.close()
